home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
Celestin Apprentice 5
/
Apprentice-Release5.iso
/
Source Code
/
PowerPlant
/
DMultiStringLocator
/
source
/
DMultiStringLocator.cp
< prev
next >
Wrap
Text File
|
1996-07-06
|
14KB
|
469 lines
// ===========================================================================
// DMultiStringLocator.cp
// ---------------------
// ©1996 Eric Gundrum, All rights reserved.
//
// The contents of this file may be freely altered and freely distributed
// in any form, provided this copyright statement is retained unaltered.
// Add your own changes below.
// ---------------------
//
// Based on source code provided in Practical Algorithms for Programmers,
// by Binstock and Rex, published in 1995 by Addison Wesley.
//
// Simultaneously search text for multiple strings using the Aho/Corasick
// algorithm.
//
// Mac Headers
#include <Types.h> // for NULL
// PowerPlant Headers
#include <LListIterator.h>
#include <UDebugging.h>
// Library Headers
#include "DMultiStringLocator.h"
// ===========================================================================
// ----------- public member function implementations ----------
// ===========================================================================
#pragma mark --- public members ---
// ---------------------------------------------------------------------------
// Constructor:
// Create state tables used by the search.
//
// inStringList is a list of pointers to DSearchStrings.
// Ownership of the contents of inStringList is NOT transferred. The contents of
// inStringList must exist as long as this instance of DMultiStringLocator exists.
DMultiStringLocator::DMultiStringLocator
( LList &inStringList // list of search strings
, int inAlphabetSize // size of BranchTable
)
: mAlphabetSize( inAlphabetSize )
{
int theChar;
stateT theState;
DSearchString *stringP = NULL;
if ( NULL == &inStringList ) throw ( input_item_NULL() ); // validate input
// compute maximum number of possible states by adding lengths of all strings
mMaxState = 1;
LListIterator stringScan ( inStringList, iterate_FromStart );
while ( stringScan.Next ( &stringP ) )
{
mMaxState += stringP->Length();
}
// alloc arrays
MatchArray = new int[mMaxState]; // list of matching chars
GotoArray = new GotoTable[mMaxState]; // list of next states or branch ptr
// GotoArray is indexed by MatchArray contents
RetryArray = new stateT[mMaxState]; // list of retry states
OutArray = new LList[mMaxState]; // list of pointers to recognizable words
// initialize arrays
for ( theState = state_begin; theState < mMaxState; ++theState )
{
MatchArray[theState] = EMPTY_SLOT;
}
// initialize state_array[state_begin]
mHighState = state_begin;
AddStateTrans ( 'a', state_begin, FAIL_STATE );
AddStateTrans ( 'b', state_begin, FAIL_STATE ); // force a BranchTable
// add each string to the state engine
stringScan.ResetTo ( iterate_FromStart );
while ( true == stringScan.Next ( &stringP ) )
{
if ( NULL != stringP ) AddString ( *stringP );
}
// setup return to zero transitions for state[state_begin]
for ( theChar = 0; theChar < mAlphabetSize; ++theChar )
{
if ( FAIL_STATE == GotoArray[state_begin].BranchTable[theChar] )
{
GotoArray[state_begin].BranchTable[theChar] = state_begin;
}
}
// compute failure array
RetryArrayInit();
}
// ---------------------------------------------------------------------------
// Search a ram-based buffer.
//
// Returns an int indicating the maximum number of characters that could be
// part of a match at the end of the buffer. This indicates what number of
// bytes should be copied to the next buffer if there is another buffer to
// search. This does not account for found strings embedded in the partial
// match string. Such embedded strings will be found twice. Note that this
// problem occurs only when a complete search is split over multiple calls.
//
// (I'm not yet sure how best to correct this design flaw. Currently I ignore
// it, but one could work around it by checking the buffer location of each
// match to see if it is a duplicate.)
int
DMultiStringLocator::SearchBuffer ( Uchar *inBufferP, int inSize )
{
unsigned char theChar; // comparison character
stateT currentState, nextState;
int matchChar, charLength = 0;
Uchar *theBufferP = inBufferP;
Boolean keepGoing = true;
// validate inputs
ThrowIfNil_( inBufferP );
currentState = state_begin;
while ( keepGoing && ( theBufferP < inBufferP + inSize )) // for each character...
{
theChar = *theBufferP++;
charLength += 1; // tracking for partial match at end of buffer
for ( ;; ) // determine the next state
{
if ( ( state_begin == currentState ) ) // branch shortcut
{
charLength = 1; // assumes new strings always start at state_begin
nextState = GotoArray[currentState].BranchTable[theChar];
}
else if ( BRANCH == ( matchChar = MatchArray[currentState] )) // branch
{
nextState = GotoArray[currentState].BranchTable[theChar];
}
else if ( matchChar == theChar ) // single char
{
nextState = GotoArray[currentState].GotoState;
}
else // empty slot
{
nextState = FAIL_STATE;
}
if ( FAIL_STATE != nextState ) break;
currentState = RetryArray[currentState]; // goto another word containing from char
Assert_( currentState < mMaxState );
}
currentState = nextState;
Assert_( currentState < mMaxState );
// scan OutArray to see if a complete word is found
if ( OutArray[currentState].GetCount() > 0 ) // list not empty
{
// report for each string of the recognized string list
DSearchString *reportItemP = NULL;
LListIterator foundList ( OutArray[currentState], iterate_FromStart );
while ( foundList.Next ( &reportItemP ) ) // for each output string...
{
if ( NULL == reportItemP ) throw ( list_item_NULL() );
keepGoing = reportItemP->ReportFound( (theBufferP-inBufferP) );
}
}
}
// Pass along the request to cancel the search.
if ( !keepGoing ) charLength = searchStatus_stop;
// Return indication of what part of the buffer should be reused.
return charLength;
}
// ---------------------------------------------------------------------------
// Destructor
DMultiStringLocator::~DMultiStringLocator()
{
stateT theState;
// search and destroy BranchTables
for ( theState = state_begin; theState < mMaxState; ++theState )
{
if ( BRANCH == MatchArray[theState] )
{
delete[] ( GotoArray[theState].BranchTable );
}
}
// destroy other arrays
delete[] ( MatchArray );
delete[] ( GotoArray );
delete[] ( RetryArray );
delete[] ( OutArray ); // listed objects are just pointers
}
// ===========================================================================
// ----------- private member function implementations ----------
// ===========================================================================
#pragma mark --- private members ---
// ---------------------------------------------------------------------------
// Add inString to list of recognizable strings
void
DMultiStringLocator::AddString( DSearchString &inString )
{
stateT currentState, nextState;
Uint8 charPos;
currentState = state_begin;
if ( NULL == &inString ) throw ( input_item_NULL() ); // validate input
// attempt to place new word on an old word
for ( charPos = 1; charPos <= inString.Length(); ++charPos ) // for each char...
{
if ( inString[charPos] == MatchArray[currentState] ) // single char slot
{
currentState = GotoArray[currentState].GotoState;
Assert_( currentState < mMaxState );
}
else if ( BRANCH == MatchArray[currentState] ) // branch
{
if ( FAIL_STATE ==
( nextState = GotoArray[currentState].BranchTable[inString[charPos]] ))
{
break; // char not part of a stored word
}
else // a transition for this character
{
currentState = nextState;
}
Assert_( currentState < mMaxState );
}
else // empty slot
{
break; // no match for this character
}
}
// add new states as needed
for ( ; charPos <= inString.Length(); ++charPos ) // for each char...
{
mHighState += 1;
if ( mHighState >= mMaxState ) throw ( state_table_exceeded() );
AddStateTrans ( inString[charPos], currentState, mHighState );
currentState = mHighState;
}
// Place a copy of the pointer to the string in the output list
if ( 0 < inString.Length() )
{
OutArray[currentState].InsertItemsAt( 1, arrayIndex_Last, &(&inString) );
}
}
// ---------------------------------------------------------------------------
// Add transition for matchChar from currentState to nextState
void
DMultiStringLocator::AddStateTrans ( int matchChar, stateT currentState, stateT nextState )
{
int theChar;
stateT *newBranchTable;
// validate inputs
Assert_( currentState < mMaxState );
Assert_( nextState < mMaxState );
if ( EMPTY_SLOT == MatchArray[currentState] ) // empty slot
{
MatchArray[currentState] = matchChar;
GotoArray[currentState].GotoState = nextState;
}
else if ( BRANCH == MatchArray[currentState] ) // branch
{
GotoArray[currentState].BranchTable[matchChar] = nextState;
}
else // single char
{
newBranchTable = new stateT[mAlphabetSize];
for ( theChar = 0; theChar < mAlphabetSize; ++theChar )
{
newBranchTable[theChar] = FAIL_STATE;
}
// copy data from single-way branch to char position of newBranchTable
newBranchTable [ MatchArray [ currentState ] ]
= GotoArray[ currentState ].GotoState;
// copy new data
newBranchTable[matchChar] = nextState;
// load state_array
MatchArray[currentState] = BRANCH;
GotoArray[currentState].BranchTable = newBranchTable;
}
}
// ---------------------------------------------------------------------------
// build RetryArray and update GotoArray
void
DMultiStringLocator::RetryArrayInit( void )
{
stateT *fifo, fifoTop;
stateT nextState, currentState;
int theChar;
// create fifo
fifo = new stateT[mMaxState]; // a FIFO list of possible states
if ( NULL == fifo ) throw ( memerror() );
fifoTop = 0;
fifo[fifoTop] = 0;
// initialize RetryArray with first level BranchTable states
for ( theChar = 0; theChar < mAlphabetSize; ++theChar )
{
if ( state_begin !=
( currentState = GotoArray[state_begin].BranchTable[theChar] ))
{
Assert_( currentState < mMaxState );
RetryArray[currentState] = state_begin;
QueueAdd (fifo, fifoTop, currentState );
}
}
// update RetryArray with states from scan of lower levels
while ( 0 != fifo[fifoTop] ) // for each state in the fifo...
{
// grab state from front of fifo and advance fifoTop to next item
nextState = fifo[fifoTop];
fifoTop = nextState; // effectively drops retrieved item from the fifo
Assert_( nextState < mMaxState );
// examine this state
if ( EMPTY_SLOT == MatchArray[nextState] )
{
continue;
}
else if ( BRANCH == MatchArray[nextState] )
{
// scan subsidiary states
for ( theChar = 0; theChar < mAlphabetSize; ++theChar ) // scan branch table
{
if ( FAIL_STATE !=
( currentState = GotoArray[nextState].BranchTable[theChar] ))
{
Assert_( currentState < mMaxState );
// add new state to the fifo
QueueAdd ( fifo, fifoTop, currentState );
FindRetryState ( theChar, RetryArray[nextState], currentState );
}
}
}
else // single character
{
QueueAdd ( fifo, fifoTop, GotoArray[nextState].GotoState );
FindRetryState ( MatchArray[nextState]
, RetryArray[nextState], GotoArray[nextState].GotoState );
}
}
delete[] ( fifo );
}
// ---------------------------------------------------------------------------
// add inItem to end of fifo (a FIFO list)
// fifo is actually a list implemented in an array. Each item points to the
// next intended item. Items are not necessarily added in order (maybe?).
void
DMultiStringLocator::QueueAdd( int *fifo, int fifoTop, stateT inState )
{
const stateT fifo_end = 0;
stateT nextState;
// validate inputs
Assert_( inState < mMaxState );
if ( NULL == fifo ) throw ( input_item_NULL() );
nextState = fifo[fifoTop];
if ( fifo_end == nextState ) // list is empty
{
fifo[fifoTop] = inState;
}
else
{
for ( ; fifo_end != fifo[nextState]; nextState = fifo[nextState] )
{
; // walk to the next open slot
Assert_( nextState < mMaxState ); // don't walk off the end of the list
}
fifo[nextState] = inState; // insert state
}
fifo[inState] = fifo_end; // terminate list
}
// ---------------------------------------------------------------------------
// Compute failure transition.
//
// inChar would normally cause a change from currentState to nextState. To
// compute the retry value, look back in search of other places inChar might go.
//
// This appears to assume inChar is either a single char match or a branch.
void
DMultiStringLocator::FindRetryState
( const int inChar
, stateT currentState
, const stateT nextState
)
{
stateT on_fail;
// validate inputs
Assert_( currentState < mMaxState );
Assert_( nextState < mMaxState );
// locate embedded words by following RetryArray until it does not link to FAIL_STATE
for ( ; ; currentState = RetryArray[currentState] )
{
Assert_( currentState < mMaxState );
if ( inChar == MatchArray[currentState] ) // desired single char
{
if ( FAIL_STATE != ( on_fail = GotoArray[currentState].GotoState )) break;
}
else if ( BRANCH == MatchArray[currentState] ) // branch
// 960630: Original code (below) fails when searching for { nil, int, Item }.
// else if ( EMPTY_SLOT != MatchArray[currentState] )
// Why was this not an explicit BRANCH test?
{
if ( FAIL_STATE !=
( on_fail = GotoArray[currentState].BranchTable[inChar] )) break;
}
}
Assert_( on_fail < mMaxState );
RetryArray[nextState] = on_fail;
// --- merge output lists
// duplicate each item of OutArray[on_fail] to end of list OutArray[nextState]
// ••• When can there every really be two words with the same end state?
if ( OutArray[on_fail].GetCount() > 0 ) // list not empty
{
void *itemP = NULL;
LListIterator failList ( OutArray[on_fail], iterate_FromStart );
while ( failList.Next ( &itemP ) ) // for each on_fail output string...
{
OutArray[nextState].InsertItemsAt( 1, arrayIndex_Last, &itemP );
}
}
}
// ===========================================================================